#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Run the path-setup script in this namespace (presumably this is what defines
# P_Lib, P_GS_Data, P_GS_Data_Raw, ... -- confirm in init_path.py). The context
# manager closes the file handle explicitly instead of leaving it open until
# garbage collection.
with open('init_path.py') as _init_script:
    exec(_init_script.read())
# Execute the shared GasStation helper script in this namespace as well.
exec((P_Lib/'GasStation.py').read_text())
# Notebook magic: render matplotlib figures inline (only works under IPython).
get_ipython().run_line_magic('matplotlib', 'inline')


# ### Further Cleanup of Price History Data

# In[2]:


# Clear up the daily folder: delete every existing per-day HDF5 file so the
# cleanup below starts from an empty PH_Day directory. A plain loop is used
# because the deletion is a pure side effect (no result list is needed).
for stale_file in (P_GS_Data_Raw / 'PH_Day').glob('*.h5'):
    stale_file.unlink()


# In[3]:
# In[4]:


# Goal of the cleanup below:
#   1. replace ID by StID
#   2. write one separate file per day
# Lookup table used for step 1 (raw ID -> StID).
dict_gsid_to_stid = load_obj(P_GS_Data / 'GS' / 'dict_gsid_to_stid.pkl')
# Raw HDF5 inputs, ordered by the integer in front of the first '.' of the
# file name.
files = sorted(
    (P_GS_Data_Raw / 'Raw_HDF').glob('*.h5'),
    key=lambda path: int(path.name.split('.')[0]),
)


# In[5]:


def clean_up_price_history_data(f):
    """Clean one raw price-history file and store it in the per-day folder.

    Reads the 'GS' table from ``f``, adds a ``StID`` column by mapping the
    ``ID`` column through the module-level ``dict_gsid_to_stid``, drops the
    rows of station 0, keeps only the station/time/price columns sorted by
    station and time, and writes the result under the same file name into
    ``P_GS_Data_Raw / 'PH_Day'`` (overwriting any existing file).

    Args:
        f: Path-like object of the raw HDF5 file; its ``name`` attribute is
            reused as the output file name.

    Returns:
        None. The only effect is the written HDF5 file.
    """
    df = read_hdf(f, 'GS')
    df['StID'] = df.ID.map(dict_gsid_to_stid)
    # NOTE(review): IDs missing from the mapping presumably end up with a NaN
    # StID and therefore pass the ``!= 0`` filter below -- confirm this is
    # intended.
    # Manual error correction:
    # 1. StID 0 has too many wrongly designated -1 (every 4 mins) & basically
    #    no PC => remove it
    kept_columns = ['StID', 'Time', 'diesel', 'e5', 'e10']
    df = (
        df.loc[df.StID != 0, kept_columns]
        .sort_values(by=['StID', 'Time'])
        .reset_index(drop=True)
    )
    # ``key`` is passed by keyword: positional arguments after the path are
    # deprecated in recent pandas versions.
    df.to_hdf(
        P_GS_Data_Raw / 'PH_Day' / f.name,
        key='GS',
        mode='w',
        complevel=9,
        complib='blosc',
    )

# Run the per-file cleanup over all raw files using a pool of 6 workers
# (Pool is presumably multiprocessing.Pool, brought in by the exec'd setup
# code -- confirm). The cleanup function returns nothing, so `ls` only holds
# one None per processed file; the real output is the files written to PH_Day.
with Pool(6) as p:
    ls = p.map(clean_up_price_history_data, files)


# In[61]:

